home *** CD-ROM | disk | FTP | other *** search
- %PAGESIZE 59 ; Turbo assembler formatting codes
- %BIN 13
- %LINUM 3
-
- ; Copyright (c) 1993 Colin Plumb. This code may be freely
- ; distributed under the terms of the GNU General Public Licence.
- ; Internet <colin@nyx.cs.du.edu>
-
- .model large
- .code
-
- ; A core operation in IDEA is multiplication modulo 65537.
- ; The valid inputs, 1 through 66636 inclusive are represented in
- ; 16-bit registers modulo 65536. I.e. a value of 0 means 65536,
- ; or -1. Thus, we need to test for that specially. -x, modulo
- ; 65537, is 65537-x = 1-x.
- ; For any other number, represent the product as a*65536+b. Since
- ; 65536 = -1 (mod 65537), this is the same number as b-a. Should
- ; this result be negautive (generate a borrow), -n mod 65537 = 1-n
- ; mod 65536. Or in other words, if you add the borrow bit back on,
- ; you get the right answer.
-
- ; This is what the assembly code does. It forms a zero, and adds
- ; that on with carry.
-
- ; Another useful optimisation takes advantage of the fact that
- ; a and b are equal only if the answer is congruent to 0 mod 65537.
- ; Since 65537 is prime, this happens only if one of the inputs is
- ; congruent to 0 mod 65537. Since the inputs are all less than 65537,
- ; this means it must have been zero.
-
- ; The code below tests for a zero result of the subtraction, and if
- ; one arises, it branches out of line to figure out what happened.
-
-
- ; This code implemets the IDEA encryption algorithm.
- ; It follows in pseudo-C, where the * operator operates
- ; modulo 65537, as Idea needs. (If you don't understand,
- ; learn IDEA better.)
-
- ; IDEA is works on 16-bit units. If you're processing bytes,
- ; it's defined to be big-endian, so an Intel machine needs to
- ; swap the bytes around.
-
- ; void Idea(u_int16 *in, u_int16 *out, u_int16 *key)
- ; {
- ; register u+int16 x0, x1, x2, x3, s1, s2, round;
- ;
- ; x0 = *in++; x1 = *in++; x2 = *in++; x3 = *in;
- ;
- ; for (round = 0; round < 8; round++) {
- ; x0 *= *key++;
- ; x1 += *key++;
- ; x2 += *key++;
- ; x3 *= *key++;
- ;
- ; s1 = x1; s2 = x2;
- ; x2 ^= x0; x1 ^= x3;
- ;
- ; x2 *= *key++;
- ; x1 += x2;
- ; x1 *= *key++;
- ; x2 += x1;
- ;
- ; x0 ^= x1; x3 ^= x2;
- ; x1 ^= s2; x2 ^= s1;
- ; }
- ; *out++ = x0 * *key++;
- ; *out++ = x2 + *key++; /* Yes, this is x2, not x1 */
- ; *out++ = x1 + *key++;
- ; *out = x3 * *key;
- ; }
-
- ; ds:si points to key, ax, dx are temps, args in bx, cx, di, bp
- ; Trashes *all* registers. direction flag must be clear.
- ; Leaves es zero.
-
- ; Since there is no spare register to hold the loop count, I make
- ; clever use of the stack, pushing the start of the loop several
- ; times and using a ret instruction to do the return.
-
- ; Annoyingly, lods is fastest on 8086's, but other techniques are
- ; best on 386's. Well, that's what the manual says, but real
- ; life is different. USELODS wins on a 386SX, at least.
- ; Leave it set for all platforms.
-
- USELODS equ 1
-
- ; bp must be x0 for some of the code below to work
- x0 equ bp
- x1 equ bx
- x2 equ cx
- x3 equ di
- ; di must be x3 for some of the code below to work
-
- ;; Now, this is rather interesting. We test for zero arguments
- ;; after the multiply. Assuming random inputs, one or both are
- ;; zero (2^17-1)/2^32, or approximately 1/32786 of the time.
- ;; Encryption in any feedback mode produces essentially random
- ;; inputs, so average-case analysis is okay. While we don't
- ;; want the out-of-line code to waste time, it is not worth
- ;; slowing down the in-line case to speed it up.
- ;;
- ;; Basically, we start inverting the source x, and if that was 0,
- ;; we use the inverse of the key instead.
-
- Core1Z:
- neg x0
- jnz Core1Za
- if USELODS
- sub x0,[si-2]
- else
- sub x0,[si]
- endif
- Core1Za:
- inc x0
- jmp Core1done
- Core2Z:
- neg x3
- jnz Core2Za
- if USELODS
- sub x3,[si-2]
- else
- sub x3,[si+6]
- endif
- Core2Za:
- inc x3
- jmp Core2done
- Core3Z:
- neg x2
- jnz Core3Za
- if USELODS
- sub x2,[si-2]
- else
- sub x2,[si+8]
- endif
- Core3Za:
- inc x2
- jmp Core3done
- Core4Z:
- neg x1
- jnz Core4Za
- if USELODS
- sub x1,[si-2]
- else
- sub x1,[si+10]
- endif
- Core4Za:
- inc x1
- jmp Core4done
-
- ; We need a constant 0 that we can move into a register without affecting
- ; the carry flag (as the classic xor ax,ax is wont to do), so we use the
- ; es register for a constant 0 source. This is okay even in protected
- ; mode. (I *told* you this was tricky code!)
-
- ; BTW, since you wanted to know, this is 8 + 78*4 + 16 = 336 instructions.
-
- Core proc near
- xor ax,ax
- mov es,ax
- mov ax,OFFSET Finish
- push ax
- mov ax,OFFSET Coreloop
- push ax ; Loop 3 times, then return
- push ax
- push ax
-
- Coreloop:
- if USELODS
- lodsw
- else
- mov ax,[si] ; x0 *= *key++
- endif
- mul x0
- sub ax,dx
- jz Core1Z
- mov x0,es
- adc x0,ax
- Core1done:
-
- if USELODS
- lodsw
- add x1,ax
- lodsw
- add x2,ax
- else
- add x1,[si+2] ; x1 += *key++
- add x2,[si+4] ; x2 += *key++
- endif
-
- if USELODS
- lodsw
- else
- mov ax,[si+6] ; x3 += *key++
- endif
- mul x3
- sub ax,dx
- jz Core2Z
- mov x3,es
- adc x3,ax
- Core2done:
-
- push x1 ; s1 = x1
- push x2 ; s2 = x2
-
- xor x1,x3 ; x1 ^= x3
- xor x2,x0 ; x2 ^= x0
-
- if USELODS
- lodsw
- else
- mov ax,[si+8] ; x2 *= *key++
- endif
- mul x2
- sub ax,dx
- jz Core3Z
- mov x2,es
- adc x2,ax
- Core3done:
-
- add x1,x2 ; x1 += x2
-
- if USELODS
- lodsw
- else
- mov ax,[si+10] ; x1 *= *key++
- endif
- mul x1
- sub ax,dx
- jz Core4Z
- mov x1,es
- adc x1,ax
- Core4done:
-
- add x2,x1 ; x2 += x1
-
- xor x0,x1 ; x0 ^= x1
- xor x3,x2 ; x3 ^= x2
-
- pop dx
- xor x1,dx ; x1 ^= s2
- pop dx
- xor x2,dx ; x2 ^= s1
-
- ; Second unrolling of loop
- if USELODS
- lodsw
- else
- mov ax,[si+12] ; x0 *= *key++
- endif
- mul x0
- sub ax,dx
- jz Core5Z
- mov x0,es
- adc x0,ax
- Core5done:
-
- if USELODS
- lodsw
- add x1,ax
- lodsw
- add x2,ax
- else
- add x1,[si+14] ; x1 += *key++
- add x2,[si+16] ; x2 += *key++
- endif
-
- if USELODS
- lodsw
- else
- mov ax,[si+18] ; x3 *= *key++
- endif
- mul x3
- sub ax,dx
- jz Core6Z
- mov x3,es
- adc x3,ax
- Core6done:
-
- push x1 ; s1 = x1
- push x2 ; s2 = x2
-
- xor x1,x3 ; x1 ^= x3
- xor x2,x0 ; x2 ^= x0
-
- if USELODS
- lodsw
- else
- mov ax,[si+20] ; x2 *= *key++
- endif
- mul x2
- sub ax,dx
- jz Core7Z
- mov x2,es
- adc x2,ax
- Core7done:
-
- add x1,x2 ; x1 += x2
-
- if USELODS
- lodsw
- else
- mov ax,[si+22] ; x1 *= *key++
- endif
- mul x1
- sub ax,dx
- jz Core8Z
- mov x1,es
- adc x1,ax
- Core8done:
-
- add x2,x1 ; x2 += x1
-
- xor x0,x1 ; x0 ^= x1
- xor x3,x2 ; x3 ^= x2
-
- pop dx
- xor x1,dx ; x1 ^= s2
- pop dx
- xor x2,dx ; x2 ^= s1
-
- ife USELODS
- lea si,[si+24]
- endif
-
- ret ; Used as a loop instruction!
-
- Core5Z:
- neg x0
- jnz Core5Za
- if USELODS
- sub x0,[si-2]
- else
- sub x0,[si+12]
- endif
- Core5Za:
- inc x0
- jmp Core5done
- Core6Z:
- neg x3
- jnz Core6Za
- if USELODS
- sub x3,[si-2]
- else
- sub x3,[si+18]
- endif
- Core6Za:
- inc x3
- jmp Core6done
- Core7Z:
- neg x2
- jnz Core7Za
- if USELODS
- sub x2,[si-2]
- else
- sub x2,[si+20]
- endif
- Core7Za:
- inc x2
- jmp Core7done
- Core8Z:
- neg x1
- jnz Core8Za
- if USELODS
- sub x1,[si-2]
- else
- sub x1,[si+22]
- endif
- Core8Za:
- inc x1
- jmp Core8done
- Core9Z:
- neg x0
- jnz Core9Za
- if USELODS
- sub x0,[si-2]
- else
- sub x0,[si]
- endif
- Core9Za:
- inc x0
- jmp Core9done
- ; Special: compute into dx (zero on entry)
- Core10Z:
- sub dx,x3
- jnz Core10Za
- if USELODS
- sub dx,[si-2]
- else
- sub dx,[si+6]
- endif
- Core10Za:
- inc dx
- ; jmp Core10done
- ret
-
-
- Finish:
- if USELODS
- lodsw
- else
- mov ax,[si] ; x0 *= *key++
- endif
- mul x0
- sub ax,dx
- jz Core9Z
- mov x0,es
- adc x0,ax
- Core9done:
-
- xchg x1,x2
- if USELODS
- lodsw
- add x1,ax
- lodsw
- add x2,ax
- else
- add x1,[si+2] ; x1 += *key++
- add x2,[si+4] ; x2 += *key++
- endif
-
- ; This is special: compute into dx, not x3
- if USELODS
- lodsw
- else
- mov ax,[si+6] ; x3 *= *key++
- endif
- mul x3
- sub ax,dx
- mov dx,es
- jz Core10Z
- adc dx,ax
- Core10done:
-
- ret
-
- endp
-
-
- ; Args are in, out, key
- public _Idea2
- _Idea2 proc far
- cld
- push bp ; Args start at [bp+6]
- mov bp,sp
- push si
- push di
- push ds ; 6 more words here, so args are at [sp+12]
- lds si,[bp+6] ; in
- lodsw
- xchg ah,al
- mov dx,ax
- lodsw
- xchg ah,al
- mov x1,ax
- lodsw
- xchg ah,al
- mov x2,ax
- lodsw
- xchg ah,al
- mov x3,ax
- lds si,[bp+14] ; key
-
- mov x0,dx
-
- call Core
-
- mov ax,x0
- mov bp,sp
- les di,[bp+16]
- xchg ah,al
- stosw
- mov ax,x1
- xchg ah,al
- stosw
- mov ax,x2
- xchg ah,al
- stosw
- mov ax,dx
- xchg ah,al
- stosw
-
- pop ds
- pop di
- pop si
- pop bp
-
- ret
-
- endp
-
- ; Okay, the basic plan for the CFB kernel is
- ; get x0,x1,x2,x3
- ; get key pointer
- ; call core
- ; get buffer pointers
- ;Loop:
- ; lodsw
- ; xor ax,x0
- ; mov x0,ax
- ; stosw
- ; lodsw
- ; xor ax,x1
- ; mov x0,ax
- ; stosw
- ; lodsw
- ; xor ax,x2
- ; mov x0,ax
- ; stosw
- ; lodsw
- ; xor ax,x3
- ; mov x3,ax
- ; stosw
- ; push buffer pointers
- ; get key pointer
- ; call core
- ; pop buffer pointers
- ; loop
- ; lodsw/xor/etc.
- ;
- ;
- ; This function is designed to go in the middle of a byte-granularity
- ; CFB engine. It performs "len" encryptions of the IV, encrypting
- ; 8*(len-1) bytes from the source to the destination. The idea is
- ; that you first xor any odd leading bytes, then call this function,
- ; then xor up to 8 trailing bytes.
-
- ; The main loop in this is 38 instructions, plus the 336 for the core
- ; makes 374 total. That's 46.75 instructions per byte.
- ; (It's the same for IdeaCFBx)
-
- ; IV, key, plain, cipher, len
- public _IdeaCFB
- _IdeaCFB proc far ; Args are at [sp+4]
- cld
- push bp
- push si
- push di
- push ds ; 8 more words here, so args are at [sp+12]
- ; To be precise, IV is at 12, key at 16, plain at 20,
- ; cipher at 24 and len at 28
- mov bp,sp
- lds si,[bp+12] ; IV
- ; Load and byte-swap IV
- mov ax,[si]
- xchg ah,al
- mov x1,[si+2]
- mov x2,[si+4]
- xchg bh,bl
- xchg ch,cl
- mov dx,[si+6]
- xchg dh,dl
-
- lds si,[bp+16] ; Key
- mov x0,ax
- mov x3,dx
-
- call Core
- IdeaCFBLoop:
- ; mov ax,x0
- ; mov bp,sp
- ; dec WORD PTR [bp+28] ; Decrement count
- ; jz IdeaCFBEnd
- ; lds si,[bp+20]
- ; les di,[bp+24]
- ; mov x0,ax
- ; Alternate code: (which is faster? Two moves or three segment overrides?)
- mov si,sp
- dec WORD PTR ss:[si+28]
- jz IdeaCFBEnd
- les di,ss:[si+24]
- lds si,ss:[si+20]
-
- lodsw
- xchg ah,al
- xor ax,x0
- mov x0,ax
- xchg ah,al
- stosw
- lodsw
- xchg ah,al
- xor ax,x1
- mov x1,ax
- xchg ah,al
- stosw
- lodsw
- xchg ah,al
- xor ax,x2
- mov x2,ax
- xchg ah,al
- stosw
- lodsw
- xchg ah,al
- xor ax,dx
- mov dx,ax
- xchg ah,al
- stosw
-
- ; mov ax,x0
- ; mov bp,sp
- ; mov [bp+20],si ; Save source offset
- ; mov [bp+24],di ; Save destination offset
- ; lds si,[bp+16] ; Key
- ; mov x0,ax ; Get x0 in place for another iteration
- ; Alternate code for the above: (which is faster? One move or three ss:?)
- mov ax,si
- mov si,sp
- mov ss:[si+20],ax
- mov ss:[si+24],di
- lds si,ss:[si+16]
-
- mov x3,dx ; Get x3 in place
- mov ax,OFFSET IdeaCFBLoop
- push ax
- jmp Core
-
- IdeaCFBEnd:
- ; lds si,[bp+12]
- lds di,ss:[si+12] ; Get IV for writing back
-
- mov ax,x0
- xchg ah,al
- mov [di],ax ; Use stosw?
- xchg bh,bl
- xchg ch,cl
- mov [di+2],x1
- mov [di+4],x2
- xchg dh,dl
- mov [di+6],dx
-
- pop ds
- pop di
- pop si
- pop bp
-
- ret
-
- endp
-
- ; This decoding step is similar, except that instead of
- ; lods
- ; xor x0,ax
- ; mov ax,x0
- ; stos
- ; the feedback step is
- ; lods
- ; xchg x0,ax
- ; xor ax,x0
- ; stos
-
- ; IV, key, cipher, plain, len
- public _IdeaCFBx
- _IdeaCFBx proc far ; Args are at [sp+4]
- cld
- push bp
- push si
- push di
- push ds ; 8 more words here, so args are at [sp+12]
- mov bp,sp
- lds si,[bp+12] ; IV
- ; Load and byte-swap IV
- mov ax,[si]
- xchg ah,al
- mov x1,[si+2]
- mov x2,[si+4]
- xchg bh,bl
- xchg ch,cl
- mov dx,[si+6]
- xchg dh,dl
-
- lds si,[bp+16] ; Key
- mov x0,ax
- mov x3,dx
-
- call Core
- IdeaCFBxLoop:
- ; mov ax,x0
- ; mov bp,sp
- ; dec WORD PTR [bp+28] ; Decrement count
- ; jz IdeaCFBxEnd
- ; lds si,[bp+20]
- ; les di,[bp+24]
- ; mov x0,ax
- ; Alternate code: (which is faster? Two moves or three segment overrides)
- mov si,sp
- dec WORD PTR ss:[si+28]
- jz IdeaCFBxEnd
- les di,ss:[si+24]
- lds si,ss:[si+20]
-
- lodsw
- xchg ah,al
- xchg x0,ax
- xor ax,x0
- xchg ah,al
- stosw
- lodsw
- xchg ah,al
- xchg x1,ax
- xor ax,x1
- xchg ah,al
- stosw
- lodsw
- xchg ah,al
- xchg x2,ax
- xor ax,x2
- xchg ah,al
- stosw
- lodsw
- xchg ah,al
- xchg dx,ax
- xor ax,dx
- xchg ah,al
- stosw
-
- ; mov ax,x0
- ; mov bp,sp
- ; mov [bp+20],si ; Save source offset
- ; mov [bp+24],di ; Save destination offset
- ; lds si,[bp+16] ; Key
- ; mov x0,ax ; Get x0 in place for another iteration
- ; Alternate code for the above: (which is faster? One move or three ss:?)
- mov ax,si
- mov si,sp
- mov ss:[si+20],ax
- mov ss:[si+24],di
- lds si,ss:[si+16]
-
- mov x3,dx ; Get x3 in place
- mov ax,OFFSET IdeaCFBxLoop
- push ax
- jmp Core
-
- IdeaCFBxEnd:
- ; lds si:[bp+12]
- lds di,ss:[si+12] ; Get IV for writing back
-
- mov ax,x0
- xchg ah,al
- mov [di],ax ; Use stosw?
- xchg bh,bl
- xchg ch,cl
- mov [di+2],x1
- mov [di+4],x2
- xchg dh,dl
- mov [di+6],dx
-
-
- pop ds
- pop di
- pop si
- pop bp
-
- ret
-
- endp
-
-
-
-
- end
-